1 R Installation

Install packages:

source("http://bioconductor.org/biocLite.R")
biocLite(c("supraHex","devtools","dnet","XGR","dplyr"))
devtools::install_github(c("hfang-bristol/supraHex", "hfang-bristol/dnet", "hfang-bristol/XGR"))

2 Map Collections

A collection of aesthetic maps supported for the self-organised representation and comparison: (from top-left to bottom-right) a supra-hexagonal map, a ring map, a diamond map, a trefoil map, a butterfly map, an hourglass map, a ladder map, a bridge map, and a triangle map.

library(supraHex)
library(ggplot2)
colormap <- 'lightyellow-lightblue'
color <- 'black'

# For "supraHex" shape itself
sHex <- sHexGridVariant(r=6, shape="suprahex")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_suprahex <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))

# For "triangle" shape
sHex <- sHexGridVariant(r=6, shape="triangle")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_triangle <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))

# For "diamond" shape
sHex <- sHexGridVariant(r=6, shape="diamond")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_diamond <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))

# For "hourglass" shape
sHex <- sHexGridVariant(r=6, shape="hourglass")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_hourglass <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))

# For "trefoil" shape
sHex <- sHexGridVariant(r=6, shape="trefoil")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_trefoil <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))

# For "ladder" shape
sHex <- sHexGridVariant(r=6, shape="ladder")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_ladder <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))

# For "butterfly" shape
sHex <- sHexGridVariant(r=6, shape="butterfly")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_butterfly <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))

# For "ring" shape
sHex <- sHexGridVariant(r=6, shape="ring")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_ring <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))

# For "bridge" shape
sHex <- sHexGridVariant(r=6, shape="bridge")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_bridge <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))

gridExtra::grid.arrange(grobs=list(gp_suprahex,gp_ring,gp_diamond,gp_trefoil,gp_butterfly,gp_hourglass,gp_ladder,gp_bridge,gp_triangle), layout_matrix=rbind(c(1,1,2,2,3),c(1,1,2,2,3),c(4,4,5,5,6),c(4,4,5,5,6),c(7,7,8,8,9)), nrow=5, ncol=5)

3 Showcase-I

In this showcase, we analyse TF and DHS datasets in K562, and compare TF datasets between K562 and Gm12878.

First of all, load the packages:

library(supraHex)
library(XGR)

# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"

Then, import TF and DHS datasets:

# extract TFBSs, stored in a list of GR objects
ENCODE_TFBS_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_TFBS_ClusteredV3_CellTypes', RData.location=RData.location)
## common to Gm12878 and K562
anno_gr_Gm12878 <- ENCODE_TFBS_ClusteredV3_CellTypes$GM12878
anno_gr_K562 <- ENCODE_TFBS_ClusteredV3_CellTypes$K562
names_common <- sort(intersect(names(anno_gr_Gm12878), names(anno_gr_K562)))
ind <- match(names_common, names(anno_gr_Gm12878))
TF_gr_Gm12878 <- anno_gr_Gm12878[ind]
ind <- match(names_common, names(anno_gr_K562))
TF_gr_K562 <- anno_gr_K562[ind]

# extract DHS, stored in a list of GR objects
ENCODE_DNaseI_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_DNaseI_ClusteredV3_CellTypes', RData.location=RData.location)
## in K562
ind <- grep("K562", names(ENCODE_DNaseI_ClusteredV3_CellTypes))
DHS_gr_K562 <- ENCODE_DNaseI_ClusteredV3_CellTypes[ind]

3.1 Trained TF map in K562

Calculate gene-centric TF association scores:

anno_gr <- TF_gr_K562
res_ls <- lapply(1:length(anno_gr), function(i){
    message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
    df_xGenes <- xGR2xGenes(anno_gr[[i]], format="GRanges", crosslink="genehancer", cdf.function="empirical", scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
    data.frame(df_xGenes, TF=rep(names(anno_gr)[i],nrow(df_xGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_K562 <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))

Justify the use of the ladder-shaped map:

data <- G2TF_K562
df <- as.data.frame(stats::prcomp(data)$x[,1:2])
gp <- ggplot(df, aes(x=PC1, y=PC2)) 
gp <- gp + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
## 2d shape plot
myColor <- xColormap(colormap="spectral")(11)
gp <- gp + stat_binhex(bins=48) + scale_fill_gradientn(colours=myColor,trans="sqrt") + geom_density2d(colour="grey")
print(gp)

Train the ladder-shaped map using cross-TF gene scores:

data <- G2TF_K562
sMap <- sPipeline(data, shape="ladder", finetuneSustain=T)

Build neighbour-joining TF tree:

D <- t(sMap$codebook)
set.seed(825)
tree_bs <- visTreeBootstrap(D, num.bootstrap=1000, nodelabels.arg=list(frame="circle",cex=0.35,col="black"), plot.phylo.arg=list(type=c("phylogram","cladogram","fan","radial")[1],cex=0.6,node.pos=1))

Visualise TF map (reordered by tree):

tree_bs_ind <- match(tree_bs$tip.label, rownames(D))
xMap <- sMap
xMap$codebook <- xMap$codebook[,tree_bs_ind]
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=51:1, rect.grid=c(7,8), title.rotate=0, title.xy=c(0.3,1.05), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.6), newpage=F)

3.2 Overlaid DHS map in K562

Calculate gene-centric DHS association scores:

anno_gr <- DHS_gr_K562
res_ls <- lapply(1:length(anno_gr), function(i){
    message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
    df_xGenes <- xGR2xGenes(anno_gr[[i]], format="GRanges", crosslink="genehancer", cdf.function="empirical", scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=T)
    data.frame(df_xGenes, Cell=rep(names(anno_gr)[i],nrow(df_xGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2DHS <- as.matrix(xSparseMatrix(df[,c("Gene","Cell","Score")]))

The DHS map is obtained by overlaying the DHS data onto the trained TF map in K562:

ind <- match(rownames(G2TF_K562), rownames(G2DHS))
additional <- G2DHS[ind,]
additional[is.na(additional)] <- 0
additional <- matrix(additional, ncol=1)
rownames(additional) <- rownames(G2TF_K562)
colnames(additional) <- 'DHS'
sOverlay_DHS <- sMapOverlay(sMap, data=G2TF_K562, additional=additional)

Visualise DHS map:

colormap <- xColormap("jet.top")
visHexMulComp(sOverlay_DHS, rect.grid=c(1,1), title.rotate=0, colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=1), newpage=F)

Rankings of 51 TFs in terms of similarity (Pearson correlation) to the DHS map in K562:

M_K562 <- sMap$codebook
M_DHS <- sOverlay_DHS$codebook
y <- M_DHS[,1]
names(y) <- 1:length(y)
ls_df <- lapply(1:ncol(M_K562), function(i){
    x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
    ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
    df_summary <- ls_res$df_summary
    data.frame(TF=colnames(M_K562)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)

## polar plot
gp_DHS <- xPolarDot(df, parallel=T, signature=F)
gp_DHS + labs(y="Pearson correlation") + ylim(0,1)

3.3 Overlaid TF map in Gm12878

Calculate gene-centric TF association scores:

anno_gr <- TF_gr_Gm12878
res_ls <- lapply(1:length(anno_gr), function(i){
    message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
    df_xGenes <- xGR2xGenes(anno_gr[[i]], format="GRanges", crosslink="genehancer", cdf.function="empirical", scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
    data.frame(df_xGenes, TF=rep(names(anno_gr)[i],nrow(df_xGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_Gm12878 <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))

The TF map is obtained by overlaying the TF data in Gm12878 onto the trained TF map in K562:

ind <- match(rownames(G2TF_K562), rownames(G2TF_Gm12878))
additional <- G2TF_Gm12878[ind,]
additional[is.na(additional)] <- 0
sOverlay_Gm12878 <- sMapOverlay(sMap, data=G2TF_K562, additional=additional)

Rankings of 51 TFs in terms of similarity (Pearson correlation) in two cell lines (K562 and Gm12878):

M_K562 <- sMap$codebook
M_Gm12878 <- sOverlay_Gm12878$codebook
ls_df <- lapply(1:ncol(M_K562), function(i){
    x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
    y <- M_Gm12878[,i]
    names(y) <- 1:nrow(M_Gm12878)
    ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
    df_summary <- ls_res$df_summary
    data.frame(TF=colnames(M_K562)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)

## polar plot
gp_twin <- xPolarDot(df, parallel=T, signature=F)
gp_twin + labs(y="Pearson correlation") + ylim(0,1)

Visualise TF map (reordered by correlation):

## yMap
yMap <- sMap
ind <- match(df$TF, colnames(yMap$codebook))
yMap$codebook <- yMap$codebook[,ind]
## zMap
zMap <- sOverlay_Gm12878
ind <- match(df$TF, colnames(zMap$codebook))
zMap$codebook <- zMap$codebook[,ind]
## xMap
xMap <- sMap
nr <- nrow(xMap$codebook)
nc <- ncol(xMap$codebook)
matrix.combined <- matrix(NA, nrow=nr, ncol=nc*2)
matrix.combined[, seq(1,nc*2,2)] <- yMap$codebook
matrix.combined[, seq(2,nc*2,2)] <- zMap$codebook
xMap$codebook <- matrix.combined
colnames(xMap$codebook) <- rep(colnames(yMap$codebook), each=2)
## visualisation
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=1:(nc*2), rect.grid=c(13,8), title.rotate=0, title.xy=c(0.3,1), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.4), newpage=F)

Perform enrichment analysis for TFs:

## TFs (high correlation)
data_high <- subset(df, rank<=25)$TF
## TFs (low correlation)
data_low <- subset(df, rank>=27)$TF

## enrichment analysis
data <- list(high=data_high, low=data_low)
ls_eTerm <- xEnricherGenesAdv(data, ontologies="SF", ontology.algorithm="none", RData.location=RData.location)

## visualisation
gp <- xEnrichForest(ls_eTerm, FDR.cutoff=0.01, zlim=c(2,10), signature=F)
gp + theme(axis.text.y=element_text(size=10))

Enriched domains for TFs

ls_eTerm$df

4 Showcase-II

In this showcase, we analyse TF and CRISPR datasets in K562.

First of all, load the packages:

library(supraHex)
library(XGR)

# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"

Then, import TF and CRISPR datasets in K562:

# extract TF, stored in a list of GR objects
ENCODE_TFBS_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_TFBS_ClusteredV3_CellTypes', RData.location=RData.location)
# calculate gene-centric TF association scores
anno_gr <- ENCODE_TFBS_ClusteredV3_CellTypes$K562
res_ls <- lapply(1:length(anno_gr), function(i){
    message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
    df_xGenes <- xGR2xGenes(anno_gr[[i]], format="GRanges", crosslink="genehancer", cdf.function="empirical", scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
    data.frame(df_xGenes, TF=rep(names(anno_gr)[i],nrow(df_xGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
mat_crispr <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))

# import CRISPR
data.file <- file.path(RData.location,'TW_Science_CRISPR.txt')
data <- read.delim(data.file, header=TRUE, stringsAsFactors=FALSE)
data <- subset(data, K562.adjusted.p.value<0.05)

# extract genes commons to CRISPR and TF datasets in K562
ind <- match(data$Gene, rownames(mat_crispr))
G2TF_K562_crispr <- mat_crispr[ind[!is.na(ind)],]
G2CRISPR_K562 <- data.frame(CRISPR=data$K562.CS[!is.na(ind)], stringsAsFactors=F)
rownames(G2CRISPR_K562) <- data$Gene[!is.na(ind)]

4.1 Trained TF map in K562

Justify the use of the triangle-shaped map:

data <- G2TF_K562_crispr
df <- as.data.frame(stats::prcomp(data)$x[,1:2])
gp <- ggplot(df, aes(x=PC1, y=PC2)) 
gp <- gp + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
## 2d shape plot
myColor <- xColormap(colormap="spectral")(11)
gp <- gp + stat_binhex(bins=24) + scale_fill_gradientn(colours=myColor,trans="sqrt") + geom_density2d(colour="grey")
print(gp)

Train the triangle-shaped map using cross-TF gene scores:

data <- G2TF_K562_crispr
sM <- sPipeline(data, shape="triangle", finetuneSustain=T, scale=3)

4.2 Overlaid CRISPR map in K562

The CRISPR map is obtained by overlaying the CRISPR data onto the trained TF map in K562:

sOverlay_CRISPR <- sMapOverlay(sM, data=G2TF_K562_crispr, additional=G2CRISPR_K562)

Visualise CRISPR map:

colormap <- xColormap("ggplot2.bottom")
visHexMulComp(sOverlay_CRISPR, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(-2.5,-1.5), gp=grid::gpar(cex=1), newpage=F)

Rankings of 100 TFs in terms of similarity (Pearson correlation) to the CRISPR map in K562:

M_K562 <- sM$codebook
M_CRISPR <- sOverlay_CRISPR$codebook
y <- M_CRISPR[,1]
names(y) <- 1:length(y)
ls_df <- lapply(1:ncol(M_K562), function(i){
    x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
    ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
    df_summary <- ls_res$df_summary
    data.frame(TF=colnames(M_K562)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)

## polar plot
gp_CRISPR <- xPolarDot(df, size=1.5, parallel=T, signature=F)
gp_CRISPR + labs(y="Pearson correlation") + scale_y_reverse(limits=c(0,-1))

Visualise TF map (reordered by correlation):

xMap <- sM
ind <- match(df$TF, colnames(xMap$codebook))
xMap$codebook <- xMap$codebook[,ind]
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=100:1, rect.grid=c(10,10), title.rotate=0, title.xy=c(0.3,1.05), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.6), newpage=F)

4.3 Gene clusters

Obtain gene clusters based on TF map:

xM <- sM
ind <- match(df$TF, colnames(xM$codebook))
xM$codebook <- xM$codebook[,ind]
#seed <- sDmatMinima(xM)
sBase <- sDmatCluster(xM, which_neigh=1)
colormap <- xColormap("spectral")
visDmatCluster(xM, sBase, colormap=colormap, newpage=F)

Calculate CRISPR scores per gene cluster:

colormap <- xColormap("jet.top")
output <- visDmatHeatmap(xM, data=G2TF_K562_crispr[,ind], sBase, base.color=xColormap("spectral"), base.separated.arg=list(col="white",lty=1,lwd=1), base.legend.location="bottomleft", colormap=colormap, zlim=c(0,1), KeyValueName="TF-gene association", labRow=NA, srtCol=90, cexCol=0.6, keep.data=T)

## Polar barplot
ind <- match(output$ID, rownames(G2CRISPR_K562))
output$val <- G2CRISPR_K562[ind,]
ls_tmp <- split(x=output$val, f=output$Cluster_base)
ls_df <- lapply(1:length(ls_tmp), function(i){
    x <- ls_tmp[[i]]
    y <- median(x)
    data.frame(Cluster=paste0('C',names(ls_tmp)[i]), Val=y, Num=length(x), stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
gp <- xPolarBar(df, colormap='spectral', size.name=10, size.value=2.5, parallel=F, signature=F)
gp

5 Showcase-III

In this showcase, we analyse GTEx V6p tissue-specific eGene datasets and ExAC loss-of-function (LoF) datasets.

First of all, load the packages:

library(supraHex)
library(XGR)

# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"

Then, import GTEx and ExAC datasets:

# import GTEx datasets
GTEx_V6p_matrix <- xRDataLoader(RData.customised='GTEx_V6p_matrix', RData.location=RData.location)
## brain or blood tissues
ind <- grepl('Brain|Blood', colnames(GTEx_V6p_matrix))
GTEx_V6p_matrix <- GTEx_V6p_matrix[,ind]
colnames(GTEx_V6p_matrix) <- gsub('Brain_','',colnames(GTEx_V6p_matrix))
## only significant eGenes (q-value<0.05)
GTEx_V6p_matrix <- -log10(GTEx_V6p_matrix)
ind <- apply(GTEx_V6p_matrix>-log10(0.05), 1, sum)
GTEx_matrix <- GTEx_V6p_matrix[ind!=0,]

# import ExAC datasets
ExAC <- xRDataLoader(RData.customised='ExAC', RData.location=RData.location)
ExAC_matrix <- data.frame(pLI=ExAC$pLI, stringsAsFactors=F)
rownames(ExAC_matrix) <- ExAC$Symbol
## ExAC LoF intolerance genes: binary (1 for yes, 0 for no)
ExAC_matrix[ExAC_matrix>0.9,1] <- 1
ExAC_matrix[ExAC_matrix<=0.9,1] <- 0

# extract genes commons to GTEx and ExAC datasets
ind <- match(rownames(ExAC_matrix), rownames(GTEx_matrix))
G2GTEx <- GTEx_matrix[ind[!is.na(ind)],]
G2ExAC <- data.frame(pLI=ExAC_matrix[!is.na(ind),], stringsAsFactors=F)
rownames(G2ExAC) <- rownames(ExAC_matrix)[!is.na(ind)]

5.1 Trained GTEx tissue map

Justify the use of the diamond-shaped map:

data <- G2GTEx
df <- as.data.frame(stats::prcomp(data)$x[,1:2])
gp <- ggplot(df, aes(x=PC1, y=PC2)) 
gp <- gp + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
## 2d shape plot
myColor <- xColormap(colormap="spectral")(11)
gp <- gp + stat_binhex(bins=32) + scale_fill_gradientn(colours=myColor,trans="sqrt") + geom_density2d(colour="grey")
print(gp)

Train the diamond-shaped map using GTEx eGene datasets:

data <- G2GTEx
sMg <- sPipeline(data, shape="diamond", finetuneSustain=T, scale=3)

5.2 Overlaid ExAC map

The ExAC map is obtained by overlaying the ExAC data onto the trained GTEx map:

sOverlay_ExAC <- sMapOverlay(sMg, data=G2GTEx, additional=G2ExAC)

Visualise ExAC map:

colormap <- xColormap("jet.top")
visHexMulComp(sOverlay_ExAC, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(0,0.4), gp=grid::gpar(cex=1), newpage=F)

Rankings of 11 tissues in terms of similarity (Pearson correlation) to the ExAC map:

M_GTEx <- sMg$codebook
M_ExAC <- sOverlay_ExAC$codebook
y <- M_ExAC[,1]
names(y) <- 1:length(y)
ls_df <- lapply(1:ncol(M_GTEx), function(i){
    x <- data.frame(name=1:nrow(M_GTEx), M_GTEx[,i], stringsAsFactors=F)
    ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
    df_summary <- ls_res$df_summary
    data.frame(tissue=colnames(M_GTEx)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)

## polar dotplot
df_tmp <- df
df_tmp$tissue <- gsub('_','\n',df_tmp$tissue)
gp_GTEx <- xPolarDot(df_tmp, size=1.5, parallel=T, signature=F)
gp_GTEx + labs(y="Pearson correlation") + scale_y_reverse(limits=c(0,-1))

Visualise GTEx map (reordered by correlation):

xMap <- sMg
ind <- match(df$tissue, colnames(xMap$codebook))
xMap$codebook <- xMap$codebook[,ind]
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=11:1, rect.grid=c(1,11), title.rotate=30, title.xy=c(0.3,1.05), colormap=colormap, zlim=c(0,6), gp=grid::gpar(cex=0.6), newpage=F)

5.3 Gene clusters

Obtain gene clusters based on GTEx tissue map:

xM <- sMg
ind <- match(df$tissue, colnames(xM$codebook))
xM$codebook <- xM$codebook[,ind]
#seed <- sDmatMinima(xM)
sBase <- sDmatCluster(xM, which_neigh=1)
colormap <- xColormap("spectral")
visDmatCluster(xM, sBase, colormap=colormap, newpage=F)

Calculate the probability of genes being LoF intolerant per gene cluster:

colormap <- xColormap("jet.top")
output_ExAC <- visDmatHeatmap(xM, data=G2GTEx[,ind], sBase, base.color=xColormap("spectral"), base.separated.arg=list(col="white",lty=1,lwd=1), base.legend.location="bottomleft", colormap=colormap, zlim=c(0,6), KeyValueName=expression(-log[10]("q-value")), labRow=NA, srtCol=90, cexCol=0.6, keep.data=T)

## Polar barplot
ind <- match(output_ExAC$ID, rownames(G2ExAC))
output_ExAC$val <- G2ExAC[ind,'pLI']
ls_tmp <- split(x=output_ExAC$val, f=output_ExAC$Cluster_base)
ls_df <- lapply(1:length(ls_tmp), function(i){
    x <- ls_tmp[[i]]
    y <- mean(x)
    data.frame(Cluster=paste0('C',names(ls_tmp)[i]), Val=y, Num=length(x), stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
gp <- xPolarBar(df, colormap='spectral', size.name=10, size.value=2.5, parallel=F, signature=F)
gp

Perform enrichment analysis for clustered genes:

ls_cluster <- split(x=output_ExAC$ID, f=output_ExAC$Cluster_base)
names(ls_cluster) <- paste0('C', names(ls_cluster))
background <- output_ExAC$ID
ontologies <- "CGL"
ls_res_CGL <- xEnricherGenesAdv(list_vec=ls_cluster, background=background, ontologies=ontologies, size.range=c(10,20000), test="hypergeo", min.overlap=5, p.tail="two-tails", RData.location=RData.location, plot=T, fdr.cutoff=0.05, displayBy="zscore")
#gp <- xEnrichForest(ls_res_CGL, top_num="auto", FDR.cutoff=0.05, signature=F)
ls_res_CGL$gp + theme(legend.title=element_text(size=8),axis.text.x=element_text(size=4))

6 Showcase-IV

In this showcase, we analyse haploid genetic screen datasets together with expression and ExAC loss-of-function (LoF) datasets.

First of all, load the packages:

library(supraHex)
library(XGR)

# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"

Then, import Haploid and define combined datasets (expression and LoF):

# import haploid datasets (only positive regulators with negative mutation index)
Haploid <- xRDataLoader(RData.customised='Haploid_regulators', RData.location=RData.location)
## all phenotypes except 'HMGCR'
ind <- grepl('HMGCR', Haploid$Phenotype)
df <- Haploid[!ind,]
df <- df[df$MI<0,]
Haploid_matrix <- as.matrix(xSparseMatrix(df[,c('Gene','Phenotype','MI')]))

# number of phenotypes per gene
vec_count <- apply(Haploid_matrix!=0, 1, sum)
# median expression
EXP <- xRDataLoader(RData.customised='Haploid_expression', RData.location=RData.location)
vec_expression <- apply(EXP, 1, median)
# ExAC LoF intolerant
ExAC <- xRDataLoader(RData.customised='ExAC', RData.location=RData.location)
ExAC_matrix <- data.frame(pLI=ExAC$pLI, stringsAsFactors=F)
rownames(ExAC_matrix) <- ExAC$Symbol
ExAC_matrix[ExAC_matrix>0.9,1] <- 1
ExAC_matrix[ExAC_matrix<=0.9,1] <- 0
vec_lof <- ExAC_matrix[,1]
names(vec_lof) <- rownames(ExAC_matrix)
# combine three above
mat <- matrix(0, nrow=nrow(Haploid_matrix), ncol=3)
rownames(mat) <- rownames(Haploid_matrix)
colnames(mat) <- c('Count','Expression','LoF')
ind <- match(rownames(mat), names(vec_count))
mat[!is.na(ind),1] <- vec_count[ind[!is.na(ind)]]
ind <- match(rownames(mat), names(vec_expression))
mat[!is.na(ind),2] <- vec_expression[ind[!is.na(ind)]]
ind <- match(rownames(mat), names(vec_lof))
mat[!is.na(ind),3] <- vec_lof[ind[!is.na(ind)]]
Combined_matrix <- mat

# extract genes commons to Haploid and Combined datasets
ind <- match(rownames(Combined_matrix), rownames(Haploid_matrix))
G2Haploid <- Haploid_matrix[ind[!is.na(ind)],]
G2Combined <- data.frame(Combined_matrix[!is.na(ind),], stringsAsFactors=F)
rownames(G2Combined) <- rownames(Combined_matrix)[!is.na(ind)]

6.1 Trained Haploid protein map

Train the trefoil-shaped map using Haploid genetic regulator datasets:

sMh <- sPipeline(data=G2Haploid, shape="trefoil", finetuneSustain=T, scale=3)

6.2 Overlaid map

The map is obtained by overlaying the combined datasets onto the trained Haploid protein map:

sO_Combined <- sMapOverlay(sMh, data=G2Haploid, additional=G2Combined)

Visualise overlaid count map:

colormap <- xColormap("jet.top")
visHexMulComp(sO_Combined, which.components=1, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(1,5), gp=grid::gpar(cex=1), newpage=F)

Visualise overlaid expression map:

colormap <- xColormap("jet.top")
visHexMulComp(sO_Combined, which.components=2, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(6,10), gp=grid::gpar(cex=1), newpage=F)

Visualise overlaid LoF map:

colormap <- xColormap("jet.top")
visHexMulComp(sO_Combined, which.components=3, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(0.3,0.7), gp=grid::gpar(cex=1), newpage=F)

Visualise Haploid protein map in 2D landscape:

sReorder <- sCompReorder(sMh, metric="euclidean")
colormap <- colorRampPalette(c("blue","#007FFF","cyan"), interpolate="spline")
visCompReorder(sMh, sReorder, title.rotate=0, title.xy=c(0.4,0.95), colormap=colormap, zlim=c(-1,0), gp=grid::gpar(cex=0.6), newpage=F)

6.3 Gene clusters

Obtain gene clusters based on Haploid protein map:

xM <- sMh
#seed <- sDmatMinima(xM)
sBase <- sDmatCluster(xM, which_neigh=1)
colormap <- xColormap("spectral")
visDmatCluster(xM, sBase, colormap=colormap, newpage=F)

Calculate the overlaid average per gene cluster:

colormap <- xColormap("jet.both")
output_Combined <- visDmatHeatmap(xM, data=G2Haploid, sBase, base.color=xColormap("spectral"), base.separated.arg=list(col="white",lty=1,lwd=1), base.legend.location="bottomleft", colormap=colormap, zlim=c(-1,1), KeyValueName=expression(-log[2]("Mutation index")), labRow=NA, reorderRow="none", srtCol=90, cexCol=0.6, keep.data=T)

## Polar barplot
ind <- match(output_Combined$ID, rownames(G2Combined))
df_tmp <- cbind(output_Combined, G2Combined[ind,])
df <- as.data.frame(df_tmp %>% dplyr::mutate(Cluster=paste0('C',Cluster_base)) %>% dplyr::group_by(Cluster) %>% dplyr::summarise(Count=mean(Count), Expression=mean(Expression),LoF=mean(LoF)))
ls_gp <- lapply(2:ncol(df), function(i){
    gp <- xPolarBar(df[,c(1,i)], colormap='spectral', size.name=10, size.value=2.5, parallel=F, signature=F)
    gp + labs(title=colnames(df)[i])
})
gridExtra::grid.arrange(grobs=ls_gp, nrow=1, ncol=3)

Perform enrichment analysis for clustered genes:

ls_cluster <- split(x=output_Combined$ID, f=output_Combined$Cluster_base)
names(ls_cluster) <- paste0('C', names(ls_cluster))
background <- NULL
ontologies <- "REACTOME"
ls_res_reactome <- xEnricherGenesAdv(list_vec=ls_cluster, background=background, ontologies=ontologies, size.range=c(20,500), test="hypergeo", min.overlap=10, p.tail="one-tail", RData.location=RData.location, plot=T, fdr.cutoff=0.01, displayBy="zscore")
#ls_res_reactome$gp + theme(legend.title=element_text(size=8),axis.text.x=element_text(size=4))

df <- ls_res_reactome$df
df <- as.data.frame(df %>% dplyr::filter(adjp < 0.01))
mat <- as.matrix(xSparseMatrix(df[,c('name','group','zscore')], columns=names(ls_cluster)))
mat[mat==0] <- NA
gp <- xHeatmap(t(mat), reorder=c("none","row","col","both")[3], colormap="#E6F598-#ABDDA4-#66C2A5-#3288BD", zlim=c(2,10), na.color='transparent', size=2.5, x.rotate=30, x.text.size=5, y.text.size=6, barheight=5, legend.title='Z-score')
gp